Loading in the libraries.


In [62]:
# Old libraries that we know and love.
import numpy as np
import matplotlib.pylab as py
import pandas as pa
%matplotlib inline

# Our new libraries.
from sklearn import datasets
from mpl_toolkits.mplot3d import Axes3D
import mayavi.mlab as mlab

iris = datasets.load_iris()

Looking at the data


In [63]:
print iris['DESCR']


Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

    ============== ==== ==== ======= ===== ====================
                    Min  Max   Mean    SD   Class Correlation
    ============== ==== ==== ======= ===== ====================
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)
    ============== ==== ==== ======= ===== ====================

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris datasets.
http://archive.ics.uci.edu/ml/datasets/Iris

The famous Iris database, first used by Sir R.A Fisher

This is perhaps the best known database to be found in the
pattern recognition literature.  Fisher's paper is a classic in the field and
is referenced frequently to this day.  (See Duda & Hart, for example.)  The
data set contains 3 classes of 50 instances each, where each class refers to a
type of iris plant.  One class is linearly separable from the other 2; the
latter are NOT linearly separable from each other.

References
----------
   - Fisher,R.A. "The use of multiple measurements in taxonomic problems"
     Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to
     Mathematical Statistics" (John Wiley, NY, 1950).
   - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.
     (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.
   - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
     Structure and Classification Rule for Recognition in Partially Exposed
     Environments".  IEEE Transactions on Pattern Analysis and Machine
     Intelligence, Vol. PAMI-2, No. 1, 67-71.
   - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule".  IEEE Transactions
     on Information Theory, May 1972, 431-433.
   - See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al"s AUTOCLASS II
     conceptual clustering system finds 3 classes in the data.
   - Many, many more ...


In [64]:
iris


Out[64]:
{'DESCR': 'Iris Plants Database\n\nNotes\n-----\nData Set Characteristics:\n    :Number of Instances: 150 (50 in each of three classes)\n    :Number of Attributes: 4 numeric, predictive attributes and the class\n    :Attribute Information:\n        - sepal length in cm\n        - sepal width in cm\n        - petal length in cm\n        - petal width in cm\n        - class:\n                - Iris-Setosa\n                - Iris-Versicolour\n                - Iris-Virginica\n    :Summary Statistics:\n\n    ============== ==== ==== ======= ===== ====================\n                    Min  Max   Mean    SD   Class Correlation\n    ============== ==== ==== ======= ===== ====================\n    sepal length:   4.3  7.9   5.84   0.83    0.7826\n    sepal width:    2.0  4.4   3.05   0.43   -0.4194\n    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)\n    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)\n    ============== ==== ==== ======= ===== ====================\n\n    :Missing Attribute Values: None\n    :Class Distribution: 33.3% for each of 3 classes.\n    :Creator: R.A. Fisher\n    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n    :Date: July, 1988\n\nThis is a copy of UCI ML iris datasets.\nhttp://archive.ics.uci.edu/ml/datasets/Iris\n\nThe famous Iris database, first used by Sir R.A Fisher\n\nThis is perhaps the best known database to be found in the\npattern recognition literature.  Fisher\'s paper is a classic in the field and\nis referenced frequently to this day.  (See Duda & Hart, for example.)  The\ndata set contains 3 classes of 50 instances each, where each class refers to a\ntype of iris plant.  One class is linearly separable from the other 2; the\nlatter are NOT linearly separable from each other.\n\nReferences\n----------\n   - Fisher,R.A. "The use of multiple measurements in taxonomic problems"\n     Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to\n     Mathematical Statistics" (John Wiley, NY, 1950).\n   - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.\n     (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.\n   - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System\n     Structure and Classification Rule for Recognition in Partially Exposed\n     Environments".  IEEE Transactions on Pattern Analysis and Machine\n     Intelligence, Vol. PAMI-2, No. 1, 67-71.\n   - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule".  IEEE Transactions\n     on Information Theory, May 1972, 431-433.\n   - See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al"s AUTOCLASS II\n     conceptual clustering system finds 3 classes in the data.\n   - Many, many more ...\n',
 'data': array([[ 5.1,  3.5,  1.4,  0.2],
        [ 4.9,  3. ,  1.4,  0.2],
        [ 4.7,  3.2,  1.3,  0.2],
        [ 4.6,  3.1,  1.5,  0.2],
        [ 5. ,  3.6,  1.4,  0.2],
        [ 5.4,  3.9,  1.7,  0.4],
        [ 4.6,  3.4,  1.4,  0.3],
        [ 5. ,  3.4,  1.5,  0.2],
        [ 4.4,  2.9,  1.4,  0.2],
        [ 4.9,  3.1,  1.5,  0.1],
        [ 5.4,  3.7,  1.5,  0.2],
        [ 4.8,  3.4,  1.6,  0.2],
        [ 4.8,  3. ,  1.4,  0.1],
        [ 4.3,  3. ,  1.1,  0.1],
        [ 5.8,  4. ,  1.2,  0.2],
        [ 5.7,  4.4,  1.5,  0.4],
        [ 5.4,  3.9,  1.3,  0.4],
        [ 5.1,  3.5,  1.4,  0.3],
        [ 5.7,  3.8,  1.7,  0.3],
        [ 5.1,  3.8,  1.5,  0.3],
        [ 5.4,  3.4,  1.7,  0.2],
        [ 5.1,  3.7,  1.5,  0.4],
        [ 4.6,  3.6,  1. ,  0.2],
        [ 5.1,  3.3,  1.7,  0.5],
        [ 4.8,  3.4,  1.9,  0.2],
        [ 5. ,  3. ,  1.6,  0.2],
        [ 5. ,  3.4,  1.6,  0.4],
        [ 5.2,  3.5,  1.5,  0.2],
        [ 5.2,  3.4,  1.4,  0.2],
        [ 4.7,  3.2,  1.6,  0.2],
        [ 4.8,  3.1,  1.6,  0.2],
        [ 5.4,  3.4,  1.5,  0.4],
        [ 5.2,  4.1,  1.5,  0.1],
        [ 5.5,  4.2,  1.4,  0.2],
        [ 4.9,  3.1,  1.5,  0.1],
        [ 5. ,  3.2,  1.2,  0.2],
        [ 5.5,  3.5,  1.3,  0.2],
        [ 4.9,  3.1,  1.5,  0.1],
        [ 4.4,  3. ,  1.3,  0.2],
        [ 5.1,  3.4,  1.5,  0.2],
        [ 5. ,  3.5,  1.3,  0.3],
        [ 4.5,  2.3,  1.3,  0.3],
        [ 4.4,  3.2,  1.3,  0.2],
        [ 5. ,  3.5,  1.6,  0.6],
        [ 5.1,  3.8,  1.9,  0.4],
        [ 4.8,  3. ,  1.4,  0.3],
        [ 5.1,  3.8,  1.6,  0.2],
        [ 4.6,  3.2,  1.4,  0.2],
        [ 5.3,  3.7,  1.5,  0.2],
        [ 5. ,  3.3,  1.4,  0.2],
        [ 7. ,  3.2,  4.7,  1.4],
        [ 6.4,  3.2,  4.5,  1.5],
        [ 6.9,  3.1,  4.9,  1.5],
        [ 5.5,  2.3,  4. ,  1.3],
        [ 6.5,  2.8,  4.6,  1.5],
        [ 5.7,  2.8,  4.5,  1.3],
        [ 6.3,  3.3,  4.7,  1.6],
        [ 4.9,  2.4,  3.3,  1. ],
        [ 6.6,  2.9,  4.6,  1.3],
        [ 5.2,  2.7,  3.9,  1.4],
        [ 5. ,  2. ,  3.5,  1. ],
        [ 5.9,  3. ,  4.2,  1.5],
        [ 6. ,  2.2,  4. ,  1. ],
        [ 6.1,  2.9,  4.7,  1.4],
        [ 5.6,  2.9,  3.6,  1.3],
        [ 6.7,  3.1,  4.4,  1.4],
        [ 5.6,  3. ,  4.5,  1.5],
        [ 5.8,  2.7,  4.1,  1. ],
        [ 6.2,  2.2,  4.5,  1.5],
        [ 5.6,  2.5,  3.9,  1.1],
        [ 5.9,  3.2,  4.8,  1.8],
        [ 6.1,  2.8,  4. ,  1.3],
        [ 6.3,  2.5,  4.9,  1.5],
        [ 6.1,  2.8,  4.7,  1.2],
        [ 6.4,  2.9,  4.3,  1.3],
        [ 6.6,  3. ,  4.4,  1.4],
        [ 6.8,  2.8,  4.8,  1.4],
        [ 6.7,  3. ,  5. ,  1.7],
        [ 6. ,  2.9,  4.5,  1.5],
        [ 5.7,  2.6,  3.5,  1. ],
        [ 5.5,  2.4,  3.8,  1.1],
        [ 5.5,  2.4,  3.7,  1. ],
        [ 5.8,  2.7,  3.9,  1.2],
        [ 6. ,  2.7,  5.1,  1.6],
        [ 5.4,  3. ,  4.5,  1.5],
        [ 6. ,  3.4,  4.5,  1.6],
        [ 6.7,  3.1,  4.7,  1.5],
        [ 6.3,  2.3,  4.4,  1.3],
        [ 5.6,  3. ,  4.1,  1.3],
        [ 5.5,  2.5,  4. ,  1.3],
        [ 5.5,  2.6,  4.4,  1.2],
        [ 6.1,  3. ,  4.6,  1.4],
        [ 5.8,  2.6,  4. ,  1.2],
        [ 5. ,  2.3,  3.3,  1. ],
        [ 5.6,  2.7,  4.2,  1.3],
        [ 5.7,  3. ,  4.2,  1.2],
        [ 5.7,  2.9,  4.2,  1.3],
        [ 6.2,  2.9,  4.3,  1.3],
        [ 5.1,  2.5,  3. ,  1.1],
        [ 5.7,  2.8,  4.1,  1.3],
        [ 6.3,  3.3,  6. ,  2.5],
        [ 5.8,  2.7,  5.1,  1.9],
        [ 7.1,  3. ,  5.9,  2.1],
        [ 6.3,  2.9,  5.6,  1.8],
        [ 6.5,  3. ,  5.8,  2.2],
        [ 7.6,  3. ,  6.6,  2.1],
        [ 4.9,  2.5,  4.5,  1.7],
        [ 7.3,  2.9,  6.3,  1.8],
        [ 6.7,  2.5,  5.8,  1.8],
        [ 7.2,  3.6,  6.1,  2.5],
        [ 6.5,  3.2,  5.1,  2. ],
        [ 6.4,  2.7,  5.3,  1.9],
        [ 6.8,  3. ,  5.5,  2.1],
        [ 5.7,  2.5,  5. ,  2. ],
        [ 5.8,  2.8,  5.1,  2.4],
        [ 6.4,  3.2,  5.3,  2.3],
        [ 6.5,  3. ,  5.5,  1.8],
        [ 7.7,  3.8,  6.7,  2.2],
        [ 7.7,  2.6,  6.9,  2.3],
        [ 6. ,  2.2,  5. ,  1.5],
        [ 6.9,  3.2,  5.7,  2.3],
        [ 5.6,  2.8,  4.9,  2. ],
        [ 7.7,  2.8,  6.7,  2. ],
        [ 6.3,  2.7,  4.9,  1.8],
        [ 6.7,  3.3,  5.7,  2.1],
        [ 7.2,  3.2,  6. ,  1.8],
        [ 6.2,  2.8,  4.8,  1.8],
        [ 6.1,  3. ,  4.9,  1.8],
        [ 6.4,  2.8,  5.6,  2.1],
        [ 7.2,  3. ,  5.8,  1.6],
        [ 7.4,  2.8,  6.1,  1.9],
        [ 7.9,  3.8,  6.4,  2. ],
        [ 6.4,  2.8,  5.6,  2.2],
        [ 6.3,  2.8,  5.1,  1.5],
        [ 6.1,  2.6,  5.6,  1.4],
        [ 7.7,  3. ,  6.1,  2.3],
        [ 6.3,  3.4,  5.6,  2.4],
        [ 6.4,  3.1,  5.5,  1.8],
        [ 6. ,  3. ,  4.8,  1.8],
        [ 6.9,  3.1,  5.4,  2.1],
        [ 6.7,  3.1,  5.6,  2.4],
        [ 6.9,  3.1,  5.1,  2.3],
        [ 5.8,  2.7,  5.1,  1.9],
        [ 6.8,  3.2,  5.9,  2.3],
        [ 6.7,  3.3,  5.7,  2.5],
        [ 6.7,  3. ,  5.2,  2.3],
        [ 6.3,  2.5,  5. ,  1.9],
        [ 6.5,  3. ,  5.2,  2. ],
        [ 6.2,  3.4,  5.4,  2.3],
        [ 5.9,  3. ,  5.1,  1.8]]),
 'feature_names': ['sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'],
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
 'target_names': array(['setosa', 'versicolor', 'virginica'], 
       dtype='|S10')}
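
An optional aside (not part of the original notebook): since pandas is already imported as pa, we can wrap the raw arrays in a DataFrame and get summary statistics for every feature in one call.


In [ ]:
# Hedged convenience sketch: build a DataFrame from the iris arrays so that
# .describe() reports min/max/mean/std for each measurement column.
irisDF = pa.DataFrame(iris['data'], columns=iris['feature_names'])
irisDF['target'] = iris['target']
print irisDF.describe()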

In [65]:
X = iris['data']
y = iris['target']

In [66]:
py.plot(X[y==0,0],X[y==0,1],'r.')
py.plot(X[y==1,0],X[y==1,1],'g.')
py.plot(X[y==2,0],X[y==2,1],'b.')


Out[66]:
[<matplotlib.lines.Line2D at 0x63021940>]

In [67]:
py.plot(X[y==0,2],X[y==0,3],'r.')
py.plot(X[y==1,2],X[y==1,3],'g.')
py.plot(X[y==2,2],X[y==2,3],'b.')


Out[67]:
[<matplotlib.lines.Line2D at 0x6319b5f8>]

In [68]:
fig = py.figure(1, figsize=(8, 6))
ax = Axes3D(fig, elev=-150, azim=110)
ax.scatter(X[y==0, 0], X[y==0, 1], X[y==0, 2], c='r')
ax.scatter(X[y==1, 0], X[y==1, 1], X[y==1, 2], c='g')
ax.scatter(X[y==2, 0], X[y==2, 1], X[y==2, 2], c='b')
py.show()



In [69]:
mlab.clf()
mlab.points3d(X[y==0, 0], X[y==0, 1], X[y==0, 2],color=(1,0,0))
mlab.points3d(X[y==1, 0], X[y==1, 1], X[y==1, 2],color=(0,1,0))
mlab.points3d(X[y==2, 0], X[y==2, 1], X[y==2, 2],color=(0,0,1))
mlab.axes()
mlab.show()

Is there a more principled way to look at the data? Yes! Let's go back to the notes.

A more principled way to look at the data: Principal Component Analysis (PCA)!

Some sample data to demonstrate PCA on.


In [70]:
mu1 = np.array([0,0,0])
mu2 = np.array([6,0,0])
np.random.seed(123)
Sigma = np.matrix(np.random.normal(size=[3,3]))
# U,E,VT = np.linalg.svd(Sigma)
# E[0] = 1
# E[1] = 1
# E[2] = 1
# Sigma = U*np.diag(E)*VT
Xrandom1 = np.random.multivariate_normal(mu1,np.array(Sigma*Sigma.T),size=500)
Xrandom2 = np.random.multivariate_normal(mu2,np.array(Sigma*Sigma.T),size=500)

Plot the data. Rotate the view so that the point cloud looks as "spread out" as possible.


In [71]:
mlab.clf()
mlab.points3d(Xrandom1[:,0], Xrandom1[:,1], Xrandom1[:,2],color=(1,0,0))
mlab.points3d(Xrandom2[:,0], Xrandom2[:,1], Xrandom2[:,2],color=(0,1,0))
mlab.axes()
mlab.show()
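
Before reaching for scikit-learn, here is a small sketch (an aside, not the original method) of what PCA actually computes on the sample cloud Xrandom1: center the data and take an SVD; the right singular vectors are the directions along which the cloud is most spread out.


In [ ]:
# PCA "by hand" on the sample cloud (a sketch, not the original notebook's code):
# center the data, take the SVD; the rows of VT are the principal directions.
Xc = Xrandom1 - Xrandom1.mean(axis=0)
U, S, VT = np.linalg.svd(Xc, full_matrices=False)
print 'principal directions (rows of VT):'
print VT
print 'variance along each direction:'
print S**2 / Xc.shape[0]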

We can do the same thing with our classification data.


In [72]:
from sklearn.decomposition import PCA

X2D = PCA(n_components=2).fit_transform(X)
py.plot(X2D[y==0,0],X2D[y==0,1],'r.')
py.plot(X2D[y==1,0],X2D[y==1,1],'g.')
py.plot(X2D[y==2,0],X2D[y==2,1],'b.')


Out[72]:
[<matplotlib.lines.Line2D at 0x5f0c6898>]
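
How much of the spread do the leading components actually capture? A quick check (an addition, not in the original notebook) using the fitted PCA object's explained_variance_ratio_ attribute; the first two components account for most of the variance, which is why the 2D plot already separates the classes fairly well.


In [ ]:
# Fit PCA on all four features and inspect the fraction of variance
# explained by each component.
pca = PCA(n_components=4).fit(X)
print 'explained variance ratio:', pca.explained_variance_ratio_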

Just as one can project from a high dimensional space to a two-dimensional space, one can also do the same thing to project to a three-dimensional space.


In [73]:
fig = py.figure(1, figsize=(8, 6))
ax = Axes3D(fig, elev=-150, azim=110)
X3D = PCA(n_components=3).fit_transform(X)
ax.scatter(X3D[y==0, 0], X3D[y==0, 1], X3D[y==0, 2], c='r')
ax.scatter(X3D[y==1, 0], X3D[y==1, 1], X3D[y==1, 2], c='g')
ax.scatter(X3D[y==2, 0], X3D[y==2, 1], X3D[y==2, 2], c='b')
py.show()


And do the same with Mayavi.


In [74]:
mlab.clf()
mlab.points3d(X3D[y==0, 0], X3D[y==0, 1], X3D[y==0, 2],color=(1,0,0))
mlab.points3d(X3D[y==1, 0], X3D[y==1, 1], X3D[y==1, 2],color=(0,1,0))
mlab.points3d(X3D[y==2, 0], X3D[y==2, 1], X3D[y==2, 2],color=(0,0,1))
mlab.axes()
mlab.show()

Let's go back to the notes for our first algorithm.

Our first classification tool, Linear Support Vector Machines.


In [75]:
# Load in the support vector machine (SVM) library
from sklearn import svm

In [77]:
# If there is one thing that I want to harp on, it is the difference
# between testing and training errors!  So, here we create a training
# set on which we compute the parameters of our algorithm, and a
# testing set for seeing how well we generalize (and work on real 
# world problems).
np.random.seed(1236)
perm = np.random.permutation(len(y))
trainSize = 100
Xtrain = X[perm[:trainSize],0:2]
Xtest = X[perm[trainSize:],0:2]

yHat = np.zeros([len(y)])

# Exists a separator
#yHat[np.logical_or(y==1,y==2)] = 1
# No perfect separator
#yHat[np.logical_or(y==1,y==0)] = 1
# All the data
yHat = y 

yHattrain = yHat[perm[:trainSize]]
yHattest = yHat[perm[trainSize:]]

But why do we split the data like this? See the notes.


In [78]:
# Some parameters we can play with.
# If there is no perfect separator, then how much do we penalize points
# that lie on the wrong side?
C = 100.
# The shape of the loss function for points that lie on the wrong side.
loss = 'l2'

In [79]:
# Run the calculation!
clf = svm.LinearSVC(loss=loss,C=C)
clf.fit(Xtrain, yHattrain)


C:\Users\randy_000\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\svm\classes.py:197: DeprecationWarning: loss='l2' has been deprecated in favor of loss='squared_hinge' as of 0.16. Backward compatibility for the loss='l2' will be removed in 1.0
  DeprecationWarning)
Out[79]:
LinearSVC(C=100.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [80]:
# Make some plots, inspired by scikit-learn tutorial
from matplotlib.colors import ListedColormap

# step size in the mesh for plotting the decision boundary.
h = .02  
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max] x [y_min, y_max].
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
py.figure(1, figsize=(8, 6))
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
py.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Plot also the training points
py.scatter(Xtrain[:, 0], Xtrain[:, 1], c=yHattrain, cmap=cmap_bold,marker='o')
py.scatter(Xtest[:, 0], Xtest[:, 1], c=yHattest, cmap=cmap_bold,marker='+')
py.xlim(xx.min(), xx.max())
py.ylim(yy.min(), yy.max())
py.show()



In [81]:
# Print out some metrics
print 'training score',clf.score(Xtrain,yHattrain)
print 'testing score',clf.score(Xtest,yHattest)


training score 0.76
testing score 0.7
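
How sensitive is the linear SVM to the penalty parameter C? A rough sweep (a sketch added here, not part of the original analysis) over a few values, scored on the same training and testing sets:


In [ ]:
# Sweep the penalty parameter C (an aside).  Larger C penalizes
# misclassified training points more heavily.
for Ctry in [0.01, 0.1, 1., 10., 100.]:
    clfTry = svm.LinearSVC(loss=loss, C=Ctry)
    clfTry.fit(Xtrain, yHattrain)
    print 'C =', Ctry, 'train', clfTry.score(Xtrain, yHattrain), \
          'test', clfTry.score(Xtest, yHattest)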

Back to the notes to define our next method.

Our second classification tool, K-nearest neighbors.


In [82]:
# Import the K-NN solver
from sklearn import neighbors

In [83]:
# If there is one thing that I want to harp on, it is the difference
# between testing and training errors!  So, here we create a training
# set on which we compute the parameters of our algorithm, and a
# testing set for seeing how well we generalize (and work on real 
# world problems).
np.random.seed(123)
perm = np.random.permutation(len(y))
trainSize = 50
Xtrain = X[perm[:trainSize],0:2]
Xtest = X[perm[trainSize:],0:2]

ytrain = y[perm[:trainSize]]
ytest = y[perm[trainSize:]]

In [84]:
# Some parameters to play around with

# The number of neighbors to use.
n_neighbors = 7

#weights = 'distance'
weights = 'uniform'

In [85]:
# Run the calculation
clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
clf.fit(Xtrain, ytrain)


Out[85]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

In [86]:
# Make some plots, inspired by the scikit-learn tutorial

# step size in the mesh for plotting the decision boundary.
h = .02  

# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max] x [y_min, y_max].
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
py.figure(1, figsize=(8, 6))
py.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Plot also the training points
py.scatter(Xtrain[:, 0], Xtrain[:, 1], c=ytrain, cmap=cmap_bold,marker='o')
py.scatter(Xtest[:, 0], Xtest[:, 1], c=ytest, cmap=cmap_bold,marker='+')
py.xlim(xx.min(), xx.max())
py.ylim(yy.min(), yy.max())
py.show()



In [87]:
# Print out some scores.
print 'training score',clf.score(Xtrain,ytrain)
print 'testing score',clf.score(Xtest,ytest)


training score 0.9
testing score 0.69
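
The same train-versus-test comparison makes the effect of n_neighbors easy to see. A small sweep (an addition, not in the original notebook): tiny values of n_neighbors tend to overfit, while large values smooth the decision boundary out.


In [ ]:
# Sweep the number of neighbors and watch the train/test gap (an aside).
for k in [1, 3, 7, 15, 31]:
    clfK = neighbors.KNeighborsClassifier(k, weights=weights)
    clfK.fit(Xtrain, ytrain)
    print 'k =', k, 'train', clfK.score(Xtrain, ytrain), \
          'test', clfK.score(Xtest, ytest)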

Back to the notes.

Loading in the libraries for regression.


In [2]:
# Old libraries that we know and love.
import numpy as np
import matplotlib.pylab as py
import pandas as pa

# Our new libraries.
from sklearn import cross_validation, linear_model, feature_selection, metrics
import mayavi.mlab as mlab

Supervised Regression

Linear Regression


In [3]:
# Read in the data using pandas.
Xy = pa.read_csv('Advertising.csv')

In [4]:
# Take a look at the contents.
Xy


Out[4]:
Unnamed: 0 TV Radio Newspaper Sales
0 1 230.1 37.8 69.2 22.1
1 2 44.5 39.3 45.1 10.4
2 3 17.2 45.9 69.3 9.3
3 4 151.5 41.3 58.5 18.5
4 5 180.8 10.8 58.4 12.9
5 6 8.7 48.9 75.0 7.2
6 7 57.5 32.8 23.5 11.8
7 8 120.2 19.6 11.6 13.2
8 9 8.6 2.1 1.0 4.8
9 10 199.8 2.6 21.2 10.6
10 11 66.1 5.8 24.2 8.6
11 12 214.7 24.0 4.0 17.4
12 13 23.8 35.1 65.9 9.2
13 14 97.5 7.6 7.2 9.7
14 15 204.1 32.9 46.0 19.0
15 16 195.4 47.7 52.9 22.4
16 17 67.8 36.6 114.0 12.5
17 18 281.4 39.6 55.8 24.4
18 19 69.2 20.5 18.3 11.3
19 20 147.3 23.9 19.1 14.6
20 21 218.4 27.7 53.4 18.0
21 22 237.4 5.1 23.5 12.5
22 23 13.2 15.9 49.6 5.6
23 24 228.3 16.9 26.2 15.5
24 25 62.3 12.6 18.3 9.7
25 26 262.9 3.5 19.5 12.0
26 27 142.9 29.3 12.6 15.0
27 28 240.1 16.7 22.9 15.9
28 29 248.8 27.1 22.9 18.9
29 30 70.6 16.0 40.8 10.5
... ... ... ... ... ...
170 171 50.0 11.6 18.4 8.4
171 172 164.5 20.9 47.4 14.5
172 173 19.6 20.1 17.0 7.6
173 174 168.4 7.1 12.8 11.7
174 175 222.4 3.4 13.1 11.5
175 176 276.9 48.9 41.8 27.0
176 177 248.4 30.2 20.3 20.2
177 178 170.2 7.8 35.2 11.7
178 179 276.7 2.3 23.7 11.8
179 180 165.6 10.0 17.6 12.6
180 181 156.6 2.6 8.3 10.5
181 182 218.5 5.4 27.4 12.2
182 183 56.2 5.7 29.7 8.7
183 184 287.6 43.0 71.8 26.2
184 185 253.8 21.3 30.0 17.6
185 186 205.0 45.1 19.6 22.6
186 187 139.5 2.1 26.6 10.3
187 188 191.1 28.7 18.2 17.3
188 189 286.0 13.9 3.7 15.9
189 190 18.7 12.1 23.4 6.7
190 191 39.5 41.1 5.8 10.8
191 192 75.5 10.8 6.0 9.9
192 193 17.2 4.1 31.6 5.9
193 194 166.8 42.0 3.6 19.6
194 195 149.7 35.6 6.0 17.3
195 196 38.2 3.7 13.8 7.6
196 197 94.2 4.9 8.1 9.7
197 198 177.0 9.3 6.4 12.8
198 199 283.6 42.0 66.2 25.5
199 200 232.1 8.6 8.7 13.4

200 rows × 5 columns


In [5]:
# Normalize data
# We do this to make plotting and processing easier.  Many Sklearn functions do this
# for you behind the scenes, but we do it explicitly.
# Note that this is a cousin of the physics idea of nondimensionalization.  Think
# about the case where TV was measured in millions, while Radio was measured in
# thousands.  One could imagine TV totally washing out the effect of Radio.
# In effect, after normalization, each predictor now stands on an "even footing".
#
# Is this always a good idea?
Xy = (Xy-Xy.min())/(Xy.max()-Xy.min())
Xy


Out[5]:
Unnamed: 0 TV Radio Newspaper Sales
0 0.000000 0.775786 0.762097 0.605981 0.807087
1 0.005025 0.148123 0.792339 0.394019 0.346457
2 0.010050 0.055800 0.925403 0.606860 0.303150
3 0.015075 0.509976 0.832661 0.511873 0.665354
4 0.020101 0.609063 0.217742 0.510994 0.444882
5 0.025126 0.027054 0.985887 0.656992 0.220472
6 0.030151 0.192087 0.661290 0.204046 0.401575
7 0.035176 0.404126 0.395161 0.099384 0.456693
8 0.040201 0.026716 0.042339 0.006157 0.125984
9 0.045226 0.673318 0.052419 0.183817 0.354331
10 0.050251 0.221170 0.116935 0.210202 0.275591
11 0.055276 0.723706 0.483871 0.032542 0.622047
12 0.060302 0.078120 0.707661 0.576957 0.299213
13 0.065327 0.327359 0.153226 0.060686 0.318898
14 0.070352 0.687859 0.663306 0.401935 0.685039
15 0.075377 0.658438 0.961694 0.462621 0.818898
16 0.080402 0.226919 0.737903 1.000000 0.429134
17 0.085427 0.949273 0.798387 0.488127 0.897638
18 0.090452 0.231654 0.413306 0.158311 0.381890
19 0.095477 0.495773 0.481855 0.165347 0.511811
20 0.100503 0.736219 0.558468 0.467018 0.645669
21 0.105528 0.800473 0.102823 0.204046 0.429134
22 0.110553 0.042273 0.320565 0.433597 0.157480
23 0.115578 0.769699 0.340726 0.227792 0.547244
24 0.120603 0.208319 0.254032 0.158311 0.318898
25 0.125628 0.886710 0.070565 0.168865 0.409449
26 0.130653 0.480893 0.590726 0.108179 0.527559
27 0.135678 0.809604 0.336694 0.198769 0.562992
28 0.140704 0.839026 0.546371 0.198769 0.681102
29 0.145729 0.236388 0.322581 0.356201 0.350394
... ... ... ... ... ...
170 0.854271 0.166723 0.233871 0.159191 0.267717
171 0.859296 0.553940 0.421371 0.414248 0.507874
172 0.864322 0.063916 0.405242 0.146878 0.236220
173 0.869347 0.567129 0.143145 0.109938 0.397638
174 0.874372 0.749746 0.068548 0.112577 0.389764
175 0.879397 0.934055 0.985887 0.364996 1.000000
176 0.884422 0.837673 0.608871 0.175901 0.732283
177 0.889447 0.573216 0.157258 0.306948 0.397638
178 0.894472 0.933378 0.046371 0.205805 0.401575
179 0.899497 0.557660 0.201613 0.152155 0.433071
180 0.904523 0.527224 0.052419 0.070361 0.350394
181 0.909548 0.736557 0.108871 0.238347 0.417323
182 0.914573 0.187690 0.114919 0.258575 0.279528
183 0.919598 0.970240 0.866935 0.628848 0.968504
184 0.924623 0.855935 0.429435 0.261214 0.629921
185 0.929648 0.690903 0.909274 0.169745 0.826772
186 0.934673 0.469395 0.042339 0.231310 0.342520
187 0.939698 0.643896 0.578629 0.157432 0.618110
188 0.944724 0.964829 0.280242 0.029903 0.562992
189 0.949749 0.060873 0.243952 0.203166 0.200787
190 0.954774 0.131214 0.828629 0.048373 0.362205
191 0.959799 0.252959 0.217742 0.050132 0.326772
192 0.964824 0.055800 0.082661 0.275286 0.169291
193 0.969849 0.561718 0.846774 0.029024 0.708661
194 0.974874 0.503889 0.717742 0.050132 0.618110
195 0.979899 0.126818 0.074597 0.118734 0.236220
196 0.984925 0.316199 0.098790 0.068602 0.318898
197 0.989950 0.596212 0.187500 0.053650 0.440945
198 0.994975 0.956713 0.846774 0.579595 0.940945
199 1.000000 0.782550 0.173387 0.073879 0.464567

200 rows × 5 columns
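
To the question in the comment above: min-max scaling is not the only option. A sketch, purely for comparison and not used in the rest of the notebook, of z-score standardization (zero mean, unit standard deviation per column):


In [ ]:
# Alternative scaling for comparison only (not used below): subtract each
# column's mean and divide by its standard deviation.
XyStd = (Xy - Xy.mean()) / Xy.std()
print XyStd.describe()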


In [6]:
# Select out our predictor columns and our response columns
X = Xy.ix[:,['TV']]
y = Xy.ix[:,['Sales']]

In [7]:
# Last time we did this by hand; now we are smarter and use the sklearn
# routine.  This routine splits data into training and testing subsets.
cross_validation.train_test_split([1,2,3,4,5],
                                  [6,7,8,9,10],
                                  test_size=0.4,
                                  random_state=5)


Out[7]:
[[2, 3, 4], [5, 1], [7, 8, 9], [10, 6]]

In [8]:
# Now we do it for the real data.
X_train,X_test,y_train,y_test = cross_validation.train_test_split(X,
                                                                  y,
                                                                  test_size=0.8)

In [9]:
# Let's take a quick look at the data.
X_train


Out[9]:
TV
58 0.710517
93 0.846128
82 0.252283
65 0.230977
154 0.632736
79 0.389922
188 0.964829
110 0.761245
89 0.368955
86 0.255665
114 0.262090
48 0.765979
156 0.315184
81 0.808590
8 0.026716
144 0.322962
53 0.615150
43 0.697328
83 0.228948
137 0.923233
189 0.060873
38 0.143389
47 0.808928
138 0.143050
192 0.055800
80 0.256003
171 0.553940
128 0.742645
35 0.980724
195 0.126818
141 0.652689
181 0.736557
36 0.900237
131 0.894488
168 0.726074
160 0.580994
41 0.596212
50 0.673318
95 0.549882
173 0.567129

In [10]:
# Run the solver
reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X_train,y_train)


Out[10]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [11]:
# These are the slope and intercept of the line we computed.
# Beta_0
print reg.intercept_
# Beta_1
print reg.coef_


[ 0.25908445]
[[ 0.46707459]]
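
As a sanity check (an aside, not in the original notes), the same Beta_0 and Beta_1 fall out of solving the least-squares problem directly with numpy; the design matrix just needs an extra column of ones for the intercept.


In [ ]:
# Solve the least-squares problem directly: beta = argmin ||A*beta - y||^2,
# where the first column of A is all ones (the intercept column).
A = np.hstack([np.ones([len(X_train), 1]), np.asarray(X_train)])
beta, residuals, rank, sv = np.linalg.lstsq(A, np.asarray(y_train))
print beta   # beta[0] ~ reg.intercept_, beta[1] ~ reg.coef_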

In [12]:
# Do a plot
plotX = np.linspace(0,1,100)
plotY = reg.predict(np.matrix(plotX).T)
py.plot(X_train,y_train,'ro')
py.plot(X_test,y_test,'go')
py.plot(plotX,plotY,'b-')


Out[12]:
[<matplotlib.lines.Line2D at 0x197eec50>]

In [13]:
# Use the metrics package to print our errors.  See discussion on slides.
print 'training error'
print metrics.mean_squared_error(y_train,reg.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,reg.predict(X_test))


training error
0.0216501521541
testing error
0.0157389506667

Back to slides.

Multi-dimensional regression


In [14]:
# Select out our predictor columns and our response columns
X = Xy.ix[:,['TV','Radio']]
y = Xy.ix[:,['Sales']]

# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(X,
                                                                  y,
                                                                  test_size=0.8,
                                                                  random_state=123)

In [15]:
# Plot the data to get a feel for it.
mlab.clf()
mlab.points3d(X_train.ix[:,0]/X.ix[:,0].std(), 
              X_train.ix[:,1]/X.ix[:,1].std(), 
              y_train.ix[:,0]/y.ix[:,0].std(),
              color=(1,0,0), scale_factor=0.2)
mlab.points3d(X_test.ix[:,0]/X.ix[:,0].std(), 
              X_test.ix[:,1]/X.ix[:,1].std(), 
              y_test.ix[:,0]/y.ix[:,0].std(),
              color=(0,1,0), scale_factor=0.2)
mlab.axes()
mlab.show()

In [16]:
# Run the solver
reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X_train,y_train)


Out[16]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [17]:
# Create data for plotting
size=10
xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
                          np.linspace(0,1,size))
zPlot = reg.predict(np.transpose(np.array([xPlot.flatten(),
                                           yPlot.flatten()])))
zPlot = zPlot.reshape([size,size])

In [18]:
# Since we will be plotting many times, we define a helper function.
def myPlot(reg,X_train,y_train,X_test,y_test,xPlot,yPlot,zPlot,size=10,scale_factor=0.05):
    mlab.clf()
    mlab.points3d(X_train.ix[:,0], 
                  X_train.ix[:,1], 
                  y_train.ix[:,0],
                  color=(1,0,0), scale_factor=scale_factor)
    mlab.points3d(X_test.ix[:,0], 
                  X_test.ix[:,1], 
                  y_test.ix[:,0],
                  color=(0,1,0), scale_factor=scale_factor)
    mlab.mesh(xPlot,yPlot,zPlot,color=(0,0,1))
    mlab.axes()
    mlab.show()
    
myPlot(reg,X_train,y_train,X_test,y_test,xPlot,yPlot,zPlot)

In [19]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,reg.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,reg.predict(X_test))


training error
0.00301813848558
testing error
0.00529973748665
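
Mean squared error is not the only way to report a fit; the coefficient of determination R^2 is often easier to interpret. A small addition using the same metrics package:


In [ ]:
# R^2 on the training and testing sets (an aside): 1.0 is a perfect fit,
# 0.0 is no better than predicting the mean.
print 'training R^2', metrics.r2_score(y_train, reg.predict(X_train))
print 'testing R^2', metrics.r2_score(y_test, reg.predict(X_test))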

Back to the notes.

Non-linear fitting


In [20]:
# Now we try non-linear fitting.  See notes for details.
# Note that we add a new column which is a *non-linear* function
# of the original data!
XyNonlinear = Xy.copy()
XyNonlinear['TV*Radio'] = Xy['TV']*Xy['Radio']

# Select out our predictor columns and our response columns
X = XyNonlinear.ix[:,['TV','Radio','TV*Radio']]
y = XyNonlinear.ix[:,['Sales']]

# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(X,
                                                                  y,
                                                                  test_size=0.8,
                                                                  random_state=123)
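
Building the interaction column by hand keeps things transparent, but scikit-learn can generate such terms automatically. A sketch (assuming sklearn.preprocessing.PolynomialFeatures is available in this version) that produces TV, Radio, and TV*Radio in one call:


In [ ]:
# Equivalent feature construction with PolynomialFeatures (an aside).
# interaction_only=True gives TV, Radio and TV*Radio without TV**2 or Radio**2.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
XPoly = poly.fit_transform(Xy.ix[:, ['TV', 'Radio']])
print XPoly[:3]   # first three rows: TV, Radio, TV*Radio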

In [21]:
# Run the solver
reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X_train,y_train)


Out[21]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [22]:
# Create data for plotting
size = 10

xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
                          np.linspace(0,1,size))
zPlot = reg.predict(np.transpose(np.array([xPlot.flatten(),
                                           yPlot.flatten(),
                                           (xPlot*yPlot).flatten()])))
zPlot = zPlot.reshape([size,size])

In [23]:
myPlot(reg,X_train,y_train,X_test,y_test,xPlot,yPlot,zPlot)

In [24]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,reg.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,reg.predict(X_test))


training error
0.000954956008276
testing error
0.00149444601649

Back to the notes.

Too much of a good thing...


In [25]:
# What about adding many non-linear combinations?  See notes for details.

degree=5
XCrazy = np.zeros([Xy.shape[0],degree**2])

for i in range(degree):
    for j in range(degree):
        XCrazy[:,i*degree + j] = (Xy['TV']**i)*(Xy['Radio']**j)
        
# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(XCrazy,
                                                                  y,
                                                                  test_size=0.8,
                                                                  random_state=123)

In [26]:
# Run the solver
regOver = linear_model.LinearRegression(fit_intercept=True)
regOver.fit(X_train,y_train)


Out[26]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [27]:
print regOver.intercept_
print regOver.coef_


[-6.94708351]
[[    0.            60.55943866  -166.07190603   184.75679655
    -72.32883146    58.42979358  -484.12475427  1297.94802252
  -1408.49267643   538.88687257  -170.96380513  1414.03394469
  -3717.84399455  3955.89719887 -1483.95231001   216.58492539
  -1787.88418163  4651.20425434 -4892.06944782  1814.15721393
   -100.38566969   829.96108308 -2151.36644682  2254.46299276
   -833.66782306]]

In [28]:
# Create data for plotting
size = 10

xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
                          np.linspace(0,1,size))

tmp = []
for i in range(degree):
    for j in range(degree):
        tmp.append( ( (xPlot**i)*(yPlot**j) ).flatten() )

zPlot = regOver.predict(np.transpose(np.array(tmp)))
zPlot = zPlot.reshape([size,size])

In [29]:
# Plot the data

# Select subsets for training and testing
X_train_plot,X_test_plot = cross_validation.train_test_split(Xy.ix[:,['TV','Radio']],
                                                             test_size=0.8,
                                                             random_state=123)

myPlot(reg,X_train_plot,y_train,X_test_plot,y_test,xPlot,yPlot,zPlot)

In [30]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,regOver.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,regOver.predict(X_test))


training error
3.41438151096e-05
testing error
0.348469042526
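
To see how the gap between training and testing error opens up, here is a sketch (not in the original notebook) that repeats the fit while the degree of the "crazy" feature set grows:


In [ ]:
# Overfitting as a function of the feature-set degree (an aside).
for d in range(1, 6):
    XD = np.zeros([Xy.shape[0], d**2])
    for i in range(d):
        for j in range(d):
            XD[:, i*d + j] = (Xy['TV']**i)*(Xy['Radio']**j)
    XD_train, XD_test, yD_train, yD_test = cross_validation.train_test_split(
        XD, y, test_size=0.8, random_state=123)
    regD = linear_model.LinearRegression(fit_intercept=True).fit(XD_train, yD_train)
    print 'degree', d, \
          'train', metrics.mean_squared_error(yD_train, regD.predict(XD_train)), \
          'test', metrics.mean_squared_error(yD_test, regD.predict(XD_test))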

Back to notes.

Model Selection


In [31]:
# Fortunately, there is a *lot* that one can do to help.  It is possible to have
# many predictors but still get good answers.  See notes for details...
degree=5
XCrazy = np.zeros([Xy.shape[0],degree**2])

names = []
for i in range(degree):
    for j in range(degree):
        XCrazy[:,i*degree + j] = (Xy['TV']**i)*(Xy['Radio']**j)
        names.append('TV**%d*Radio**%d'%(i,j))

# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(XCrazy,
                                                                  y,
                                                                  test_size=0.8,
                                                                  random_state=123)

In [32]:
# We can try n_features_to_select=None and 3 to see what we get.
selector = feature_selection.RFE(regOver,n_features_to_select=3)
selector.fit(X_train,y_train)


C:\Users\randy_000\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\utils\validation.py:515: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Out[32]:
RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
  estimator_params=None, n_features_to_select=3, step=1, verbose=0)

In [33]:
# Print out the predictors we use.  These are the predictors selected by the RFE
# algorithm as the most important.
for i in range(len(names)):
    print names[i],
    print selector.get_support()[i]


TV**0*Radio**0 False
TV**0*Radio**1 False
TV**0*Radio**2 False
TV**0*Radio**3 False
TV**0*Radio**4 False
TV**1*Radio**0 False
TV**1*Radio**1 False
TV**1*Radio**2 False
TV**1*Radio**3 False
TV**1*Radio**4 False
TV**2*Radio**0 False
TV**2*Radio**1 True
TV**2*Radio**2 False
TV**2*Radio**3 False
TV**2*Radio**4 False
TV**3*Radio**0 False
TV**3*Radio**1 False
TV**3*Radio**2 False
TV**3*Radio**3 True
TV**3*Radio**4 True
TV**4*Radio**0 False
TV**4*Radio**1 False
TV**4*Radio**2 False
TV**4*Radio**3 False
TV**4*Radio**4 False
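
Choosing n_features_to_select by hand is somewhat ad hoc. A sketch (assuming feature_selection.RFECV is available in this scikit-learn version) that lets cross-validation pick the number of features instead:


In [ ]:
# Recursive feature elimination with cross-validation (an aside).
selectorCV = feature_selection.RFECV(linear_model.LinearRegression(fit_intercept=True),
                                     step=1, cv=5)
selectorCV.fit(X_train, np.ravel(y_train))
print 'number of features chosen:', selectorCV.n_features_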

In [34]:
# Create data for plotting
size = 10

xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
                          np.linspace(0,1,size))

tmp = []
for i in range(degree):
    for j in range(degree):
        tmp.append( ( (xPlot**i)*(yPlot**j) ).flatten() )

zPlot = selector.predict(np.transpose(np.array(tmp)))
zPlot = zPlot.reshape([size,size])

In [35]:
# Plot the data

# Select subsets for training and testing
X_train_plot,X_test_plot = cross_validation.train_test_split(Xy.ix[:,['TV','Radio']],
                                                             test_size=0.8,
                                                             random_state=123)
myPlot(reg,X_train_plot,y_train,X_test_plot,y_test,xPlot,yPlot,zPlot)

In [36]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,selector.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,selector.predict(X_test))


training error
0.00265007247837
testing error
0.00490752683194

Back to notes.

Lasso!


In [37]:
# Lasso regression is another method for doing feature selection.
# It is, by far, my favorite; it is a close cousin of my personal
# research topic.  See notes for details...
degree=5
XCrazy = np.zeros([Xy.shape[0],degree**2])

names = []
for i in range(degree):
    for j in range(degree):
        XCrazy[:,i*degree + j] = (Xy['TV']**i)*(Xy['Radio']**j)
        names.append('TV**%d*Radio**%d'%(i,j))
                     
# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(XCrazy,
                                                                  y,
                                                                  test_size=0.8,
                                                                  random_state=123)

In [38]:
# Run the solver
regLasso = linear_model.Lasso(alpha=0.002,fit_intercept=True,normalize=True)
regLasso.fit(X_train,y_train)


Out[38]:
Lasso(alpha=0.002, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [39]:
# Print out the predictors we use.  These betas with non-zero weights are those
# selected by the Lasso algorithm as being the most important.  What do you notice?
print regLasso.intercept_
for i in range(len(regLasso.coef_)):
    print names[i],regLasso.coef_[i]


[ 0.27046738]
TV**0*Radio**0 0.0
TV**0*Radio**1 0.0
TV**0*Radio**2 0.0
TV**0*Radio**3 0.0
TV**0*Radio**4 0.0
TV**1*Radio**0 0.135662574951
TV**1*Radio**1 0.670730481986
TV**1*Radio**2 0.0
TV**1*Radio**3 0.0
TV**1*Radio**4 0.0
TV**2*Radio**0 0.0
TV**2*Radio**1 0.0
TV**2*Radio**2 0.0
TV**2*Radio**3 0.0
TV**2*Radio**4 0.0
TV**3*Radio**0 0.0
TV**3*Radio**1 0.0
TV**3*Radio**2 0.0
TV**3*Radio**3 0.0
TV**3*Radio**4 0.0
TV**4*Radio**0 0.0
TV**4*Radio**1 0.0
TV**4*Radio**2 0.0
TV**4*Radio**3 0.0
TV**4*Radio**4 0.0

In [40]:
# Create data for plotting
size = 10

xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
                          np.linspace(0,1,size))

tmp = []
for i in range(degree):
    for j in range(degree):
        tmp.append( ( (xPlot**i)*(yPlot**j) ).flatten() )

zPlot = regLasso.predict(np.transpose(np.array(tmp)))
zPlot = zPlot.reshape([size,size])

In [41]:
# Plot the data

# Select subsets for training and testing
X_train_plot,X_test_plot = cross_validation.train_test_split(Xy.ix[:,['TV','Radio']],
                                                             test_size=0.8,
                                                             random_state=123)
myPlot(reg,X_train_plot,y_train,X_test_plot,y_test,xPlot,yPlot,zPlot)

In [42]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,regLasso.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,regLasso.predict(X_test))


training error
0.00118345029043
testing error
0.002103671494
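
The penalty weight alpha was set by hand above; linear_model.LassoCV can choose it by cross-validation instead. A sketch, under the assumption that LassoCV behaves as in this scikit-learn version:


In [ ]:
# Let cross-validation choose the Lasso penalty (an aside).
regLassoCV = linear_model.LassoCV(normalize=True, cv=5)
regLassoCV.fit(X_train, np.ravel(y_train))
print 'chosen alpha:', regLassoCV.alpha_
print 'testing error', metrics.mean_squared_error(y_test, regLassoCV.predict(X_test))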
